---
title: "Analysis"
author: "Matthew B. Platt"
date: "10/22/2020"
output: html_document
---

```{r setup, include=FALSE}
# Echo the source of every chunk in the rendered document by default.
knitr::opts_chunk$set(echo = TRUE)

# Attach the tidyverse; later chunks use its pipe, dplyr verbs, and ggplot2.
library(tidyverse)

```

## Load Data

```{r data}

# Restore the saved analysis objects into the workspace. Later chunks use
# thesisdata, electivestack, and sumthesis1, none of which are created in this
# document -- presumably they all come from this file; confirm.
# The path is relative, so it depends on the working directory at knit time.
load("../../Data/AnalysisData/analysis.RData")

```

## Table 1 

Table 1 provides an overview of the data.

```{r table1}

# Summarise the thesis data by seminar field: number of students (rows), number
# of theses submitted, and number of submitted theses that were graded.
over <- thesisdata %>%
  group_by(semfield) %>%
  summarise(count = n(),
            submits = sum(submitted, na.rm = TRUE),
            grades = sum(graded, na.rm = TRUE))

# total number of students across all seminar fields
totstudent <- sum(over$count)

# Build the "Total" row as a one-row data frame rather than a character vector,
# so rbind() leaves the count columns numeric instead of coercing them to text.
total <- data.frame(semfield = "Total",
                    count = totstudent,
                    submits = sum(over$submits),
                    grades = sum(over$grades),
                    stringsAsFactors = FALSE)
over <- rbind(over, total)

# print the table
knitr::kable(over,
             col.names = c("Field", "Students", "Theses Submitted",
                           "Theses Assessed"),
             align = 'lccc', padding = 2,
             caption = 'The Number of Senior Theses by Seminar Field, Submission, and Assessment')


```

## Table 2

Table 2 provides the proportion of students who have completed the methods and electives requirements prior to taking Senior Seminar.

```{r table2}

# Percentage of students meeting each prerequisite before Senior Seminar.
# Each value is the column mean (missing values dropped), rounded to three
# decimals and then scaled to 0-100.
path <- thesisdata %>%
  summarise(scopeprop = round(mean(scope, na.rm = TRUE), digits = 3) * 100,
            electprop = round(mean(allelect, na.rm = TRUE), digits = 3) * 100,
            preprop = round(mean(prereq, na.rm = TRUE), digits = 3) * 100,
            match = round(mean(topicmatch, na.rm = TRUE), digits = 3) * 100)

# Row of Ns: all rows of thesisdata for the first three columns, and the number
# of graded theses for the topic-match column.
# NOTE(review): the means above drop NAs, so nrow() may overstate the number of
# observations actually averaged -- confirm.
total <- c(nrow(thesisdata), nrow(thesisdata), nrow(thesisdata),
           sum(thesisdata$graded, na.rm = TRUE))
# bind the percentage row with the N row
path <- rbind(path, total)
# Prepend the row labels as a named column; an unnamed cbind() argument would
# be given the deparsed expression as its column name.
path <- cbind(stat = c("Proportion", "N"), path)

# print the table
knitr::kable(path,
             col.names = c("", "Methods", "Electives", "Both", "Topic Match"),
             align = 'lcccc', padding = 2,
             caption = 'Table 2: Most students take methods but not their electives prior to the capstone.')


# Data for the footnote on lowering the threshold to 3 electives instead of 4.
# allelect is redefined inside this pipeline only (thesisdata itself is not
# modified), and the results are left as proportions (not multiplied by 100).
# path2 is not printed in this chunk.
path2 <- thesisdata %>%
  mutate(allelect = as.numeric(electives >= 3),
         prereq2 = as.numeric(electives >= 3 & scope == 1)) %>%
  summarise(scopeprop = round(mean(scope, na.rm = TRUE), digits = 3),
            electprop = round(mean(allelect, na.rm = TRUE), digits = 3),
            preprop = round(mean(prereq2, na.rm = TRUE), digits = 3),
            match = round(mean(topicmatch, na.rm = TRUE), digits = 3))

```

## Table 3

Table 3 summarizes the thirty-seven different courses that were used to satisfy the upper-level elective requirement over the period of study.

```{r table3}

# Summarise the elective courses by subfield: number of courses (rows), total
# students (sum of Freq), and the sums of the syllabus and research columns.
# resprop is a sum, not a proportion, despite its name.
electable <- electivestack %>%
  group_by(field) %>%
  summarise(count = n(),
            students = sum(Freq, na.rm = TRUE),
            available = sum(syllabus, na.rm = TRUE),
            resprop = sum(research, na.rm = TRUE)
            )

# Total row that sums the columns of "electable". It is a one-row data frame
# rather than a character vector so the numeric columns stay numeric when bound.
total <- data.frame(field = "Total",
                    count = sum(electable$count),
                    students = sum(electable$students),
                    available = sum(electable$available),
                    resprop = sum(electable$resprop),
                    stringsAsFactors = FALSE)

# Set the levels of the subfield column, including a "Total" level.
# NOTE(review): this assigns levels by position, so it assumes the existing
# levels of `field` are already in exactly this order -- confirm against the data.
levels(electable$field) <- c("American", "Comparative", "English", "IR", "Law", "none", "Theory", "transfer", "Total")

# bind the subfield rows and the total row into one table
electable <- rbind(electable, total)

# print the table
knitr::kable(electable,
             col.names = c("Field", "Courses", "Students", "Syllabus",
                           "Research"),
             align = 'lcccc', padding = 2,
             caption = 'Table 3: Students gravitate towards law and comparative electives.')

```


## Table 4

Table 4 underlines this point by looking at the top ten electives by student enrollment. It was constructed by opening "electivestack.csv" in Excel and sorting by "freq".

```{r}
# Top ten electives by enrollment, entered by hand. The table is written one
# course per row so each course's rank, number, title, enrollment, and syllabus
# flag stay together, and it is stored as a data frame so the numeric columns
# are not coerced to character (as cbind() of mixed vectors would do). Building
# it in one call also avoids a global named `rank` that masks base::rank().
# syllabus is "yes"/"no", or NA where the value is missing.
# NOTE(review): the syllabus column is printed under the header "Research" --
# confirm that label is intended.
electrank <- tribble(
  ~rank, ~cnum,     ~ctitle,                            ~cstudents, ~syllabus,
  1,     "PSC 351", "Intro to Moot Court",              49,         "no",
  2,     "ENG 265", "Advanced Composition",             44,         NA_character_,
  3,     "PSC 302", "Third World Politics",             42,         "no",
  4,     "PSC 385", "Theories of IR",                   34,         "no",
  5,     "PSC 477", "African Politics",                 32,         "yes",
  6,     "PSC 464", "Black Political Thought",          28,         "yes",
  7,     "PSC 350", "Race and Law",                     25,         "no",
  8,     "PSC 463", "Contemporary Theories of Justice", 20,         NA_character_,
  9,     "PSC 486", "Policy Ideologies",                20,         NA_character_,
  10,    "PSC 322", "American Congress",                18,         "yes"
)

# print the table
knitr::kable(electrank,
             col.names = c("Rank", "Course", "Title", "Students", "Research"),
             align = 'lcccc', padding = 2,
             caption = 'Table 4: The upper-level electives are not developing research skills')

```

## Table 5

Table 5 shows the percentage of projects that satisfy each of the seven criteria.

```{r}

# One-row summary over the graded theses (graded == 1): how many there are and,
# for each criterion column, its mean expressed as a percentage.
quality <- summarise(
  filter(thesisdata, graded == 1),
  count = n(),
  sourceprop = 100 * mean(sourcecount),
  conceptprop = 100 * mean(concepts),
  litprop = 100 * mean(litcritique),
  hypoprop = 100 * mean(hypothesis),
  primaryprop = 100 * mean(primarydata),
  dataprop = 100 * mean(dataviz),
  blackprop = 100 * mean(blackpol)
)

# Render the summary, rounding to three digits and centring every column.
knitr::kable(
  quality,
  digits = 3,
  col.names = c("Assessed", "Sources", "Concepts", "Critique", "Hypothesis",
                "Primary Source", "Visualization", "Black Polisci"),
  align = 'c',
  padding = 2,
  caption = 'Table 5: For four out of the seven key attributes, A majority of student projects demonstrate competence.'
)

```

## Table 6

Table 6 reinforces the message of Figure 4 by showing the percentage of students who satisfy each criterion by the subfield of the Senior Seminar. A student's choice of seminar overwhelms the potential benefits or deficits of his prior curricular choices.

```{r}

# Summary of the graded theses (graded == 1) by seminar field: for each
# criterion column, its mean within the field expressed as a percentage.
quality2 <- thesisdata %>%
  filter(graded == 1) %>%
  group_by(semfield) %>%
  summarise(
            sourceprop = mean(sourcecount) * 100,
            conceptprop = mean(concepts) * 100,
            litprop = mean(litcritique) * 100,
            hypoprop = mean(hypothesis) * 100,
            primaryprop = mean(primarydata) * 100,
            dataprop = mean(dataviz) * 100,
            blackprop = mean(blackpol) * 100
            )

# print the table; the alignment string has one entry per column
# (the field label plus the seven criteria)
knitr::kable(quality2, digits = 3,
             col.names = c("Field", "Sources", "Concepts", "Critique",
                           "Hypothesis", "Primary Source", "Visualization",
                           "Black Polisci"),
             align = 'lccccccc', padding = 2,
             caption = 'Table 6: The American Politics seminars meet the criteria at a higher level than other fields')

```

## Table 7

Table 7 presents a linear regression of students' thesis grades (scored on a 100-point scale) on whether they had taken methods prior to the capstone, the number of upper-level electives taken prior to the capstone, and their cumulative GPA. Consistent with the other results, the curricular path does not have an effect.

```{r}

# Models m1-m7 share the predictors scope, gpa, electives, and semfield. They
# are fit here but are not passed to the results table at the end of this chunk.

# regression of score
m1 <- lm(score ~ scope + gpa + electives + semfield, data = thesisdata)

# regression of source
# NOTE(review): the outcome here is `sources`, while the summary tables use
# `sourcecount` -- confirm this is the intended column.
m2 <- lm(sources ~ scope + gpa + electives + semfield, data = thesisdata)

# regression of concepts
m3 <- glm(concepts ~ scope + gpa + electives + semfield, data = thesisdata, family = "quasibinomial")

# regression of critique
m4 <- glm(litcritique ~ scope + gpa + electives + semfield, data = thesisdata, family = "quasibinomial")

# regression of hypothesis
m5 <- glm(hypothesis ~ scope + gpa + electives + semfield, data = thesisdata, family = "quasibinomial")

# regression of primary source/data
m6 <- glm(primarydata ~ scope + gpa + electives + semfield, data = thesisdata, family = "quasibinomial")

# regression of dataviz
m7 <- glm(dataviz ~ scope + gpa + electives + semfield, data = thesisdata, family = "quasibinomial")

# regression of thesis grade on the methods prerequisite, number of electives,
# and GPA (no seminar-field term)
m8 <- lm(thesis ~ scope + electives + gpa, data = thesisdata)

# Print the regression table for m8. The covariate labels follow the order of
# the terms in the m8 formula. stargazer selects the output format with `type`;
# it has no `output` parameter, so a stray `output =` would be collected by
# `...` as an additional object to tabulate.
stargazer::stargazer(m8, title = "Research grades are driven by GPA and not curriculum.",
                     dep.var.labels = c("Final Project Grade"),
                     covariate.labels = c("Methods prior to Capstone", "number of Electives", "Cumulative GPA"),
                     header = FALSE, type = "latex")

```


## Figure 1

Figure 1 presents a histogram of summed scores for the senior thesis projects. Most theses contain three or fewer of the desired attributes.

```{r}

# create version of the data that excludes theses that were not available for
# coding/grading
cleanthesis <- filter(thesisdata, graded == 1)

# Histogram of "score". An explicit binwidth of 1 draws one bar per whole-number
# score and avoids the default 30-bin choice (and the message it triggers).
# assumes score takes whole-number values (a count of attributes) -- TODO confirm
p3 <- ggplot(data = cleanthesis, aes(x = score)) +
  geom_histogram(binwidth = 1)
p3

```

## Figure 2

Figure 2 shows the average score for the thesis by whether the student had taken Scope and Methods prior to taking Senior Seminar. Completing the Scope prerequisite does not affect the quality of capstone projects.

```{r}

# plot scope vs. thesis
# Work on a copy of thesisdata and add a factor version of "scope" for whether
# the student passed Scope and Methods prior to taking Senior Seminar.
thesisdata1 <- thesisdata
thesisdata1$scopefct <- as.factor(thesisdata1$scope)
# NOTE(review): the relabelling is positional, so it assumes scope has exactly
# two observed values that sort as no, then yes (e.g. 0 and 1) -- confirm.
levels(thesisdata1$scopefct) <- c("No", "Yes")

# Mean score by the Scope prerequisite, using only rows with a non-missing
# score. !is.na() replaces a comparison against F, which is an ordinary
# variable that can be reassigned.
meanscore1 <- thesisdata1 %>%
  filter(!is.na(score)) %>%
  group_by(scopefct) %>%
  summarise(avgscore = mean(score))

# make a barchart; the legend is hidden because the x axis already names the groups
p4 <- ggplot(meanscore1, aes(y = avgscore, x = scopefct, fill = scopefct)) +
  geom_col() +
  labs(x = "Completed Methods", y = "Average # of Attributes", fill = "Methods") +
  theme(legend.position = "none")
p4

```

## Figure 3

Figure 3 illustrates that completing the intended electives does not affect the quality of capstone projects.

```{r}

# Figure 3: boxplots of score by the number of electives taken.
# Add an ordered-factor copy of "electives" to thesisdata1 (created in the
# Figure 2 chunk) so that each distinct value gets its own box.
thesisdata1[["electord"]] <- as.ordered(thesisdata1[["electives"]])

# keep only the theses that were graded
thesisdata2 <- thesisdata1 %>%
  filter(graded == 1)

# one box per elective count, filled by that count, with the legend hidden
p5 <- thesisdata2 %>%
  ggplot(aes(x = electord, y = score, fill = electord)) +
  geom_boxplot() +
  labs(x = "# of Electives", y = "# of Attributes", fill = "Electives") +
  theme(legend.position = "none")
p5

```

## Figure 4

Figure 4 plots the average attribute score for capstone projects by academic year and seminar subfield. On average, research projects in the American Politics senior seminar meet more of our criteria than those in the other subfields. 

```{r}

# Keep only the summary rows with at least one graded thesis (grades > 0).
# This overwrites sumthesis1 in place.
sumthesis1 <- sumthesis1 %>%
  filter(grades > 0)

# Scatter of avgscore against ayear: semfield sets both the shape and the
# colour of each point, and grades sets its size.
p6 <- ggplot(
  data = sumthesis1,
  mapping = aes(x = ayear, y = avgscore, size = grades,
                shape = semfield, color = semfield)
) +
  geom_point() +
  labs(x = "Academic Year", y = "Average # of Attributes", shape = "Field", color = "Field", size = "Assessed")
p6

```



